/*
 * All Video Processing kernels 
 * Copyright © <2010>, Intel Corporation.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * This file was originally licensed under the following license
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

#ifndef COMMON_INC
#define COMMON_INC

// Module name: common.inc
//
// Common header file for all Video-Processing kernels
//

// Assembler-wide defaults for every kernel that includes this file:
// default execution size is 16 and default register type is :ub.
.default_execution_size (16)
.default_register_type  :ub

// Register budget declared to the assembler: 80 registers in total, 4 of them payload.
// NOTE(review): the 4 payload registers presumably correspond to the static
// parameters r1-r4 defined below - confirm.
.reg_count_total        80
.reg_count_payload      4


//========== Common constants ==========

// Single-bit mask constants: BITn has only bit n set (BITn == 1 << n), for n = 0..31.
#define BIT0    0x01
#define BIT1    0x02
#define BIT2    0x04
#define BIT3    0x08
#define BIT4    0x10
#define BIT5    0x20
#define BIT6    0x40
#define BIT7    0x80
#define BIT8    0x0100
#define BIT9    0x0200
#define BIT10   0x0400
#define BIT11   0x0800
#define BIT12   0x1000
#define BIT13   0x2000
#define BIT14   0x4000
#define BIT15   0x8000
#define BIT16   0x00010000
#define BIT17   0x00020000
#define BIT18   0x00040000
#define BIT19   0x00080000
#define BIT20   0x00100000
#define BIT21   0x00200000
#define BIT22   0x00400000
#define BIT23   0x00800000
#define BIT24   0x01000000
#define BIT25   0x02000000
#define BIT26   0x04000000
#define BIT27   0x08000000
#define BIT28   0x10000000
#define BIT29   0x20000000
#define BIT30   0x40000000
#define BIT31   0x80000000

// GRF geometry: one GRF register is 32 bytes = 16 words = 8 dwords.
#define nGRFWIB             32      // GRF register width in bytes
#define nGRFWIW             16      // GRF register width in words
#define nGRFWID             8       // GRF register width in dwords

// Field selectors
#define nTOP_FIELD          0
#define nBOTTOM_FIELD       1

// Frame selectors
#define nPREVIOUS_FRAME     0       // Previous frame
#define nCURRENT_FRAME      1       // Current frame
#define nNEXT_FRAME         2       // Next frame

// Platform conditional: both branches currently hold only comments,
// so this section contributes no definitions for either GT or ILK.
#ifdef GT
// GT DI Kernel
#else // ILK
// ILK DI Kernel
#endif

//===================================

//========== Macros ==========
// REGION(Width,HStride) expands textually to the source region
// <Width*HStride;Width,HStride>, e.g. REGION(16,1) -> <16*1;16,1>.
// The product is emitted as written; it is not folded by the preprocessor.
#define REGION(Width,HStride) <Width*HStride;Width,HStride> // Region definition when ExecSize = Width

// REG(r,n)    -> r<n>.0    e.g. REG(r,nTOP_Y) -> r10.0
// REG2(r,n,s) -> r<n>.<s>
// The _REG/__REG (and _REG2/__REG2) indirection lets the arguments be
// macro-expanded (nTOP_Y -> 10) before they are pasted together with ##.
// RegFile(a) is an identity macro applied to the register-file letter.
#define RegFile(a) a
#define REG(r,n) _REG(RegFile(r),n)
#define _REG(r,n) __REG(r,n)
#define __REG(r,n) r##n.0
#define REG2(r,n,s) _REG2(RegFile(r),n,s)
#define _REG2(r,n,s) __REG2(r,n,s)
#define __REG2(r,n,s) r##n.##s

// Null destination operands, typed as dword (:d) and word (:w)
#define dNULLREG     null<1>:d
#define wNULLREG     null<1>:w
    
// KERNEL_ID(id) emits a mov of the immediate id (:ud) into NULLREG.
// NOTE(review): NULLREG is not defined in this file (only dNULLREG and
// wNULLREG are) - confirm it is provided by the including source before use.
#define KERNEL_ID(kernel_ID)    mov NULLREG kernel_ID:ud


// Instruction-option macros. They currently expand to nothing; the
// commented-out variants below would emit the NoDDClr / NoDDChk options instead.
#define NODDCLR 			
#define NODDCLR_NODDCHK 	
#define NODDCHK			    

//#define NODDCLR 			{ NoDDClr }
//#define NODDCLR_NODDCHK 	{ NoDDClr, NoDDChk }
//#define NODDCHK				{ NoDDChk } 


//========== Defines ====================


//========== GRF partition ==========
// r0 header            :   r0          (1 GRF)
// Static parameters    :   r1 - r4     (4 GRFs)
// Inline parameters    :   r5 - r6     (2 GRFs)
// MSGSRC               :   r8          (1 GRF)
// Top IO region        :   r10 - r33   (24 GRFs, 8 for each component Y,U,V 16X8:w)
// Free space           :   r34 - r55   (22 GRFs)
// Bottom IO region     :   r56 - r79   (24 GRFs, 8 for each component Y,U,V 16X8:w)
//===================================


//========== Static Parameters ==========
// Subregister numbers are counted in units of the element type named by the
// prefix: f/ud = DWORD index, w/uw = word index, ub = byte index.
// E.g. fPROCAMP_C1 (r1.3) and wPROCAMP_C1 (r1.6) both start at DWORD 3.
// The unprefixed NUMBER_* names use word indices (r1.7 is in DWORD 3).
// Several names alias the same location (e.g. fPROCAMP_C0 / udCP_MessageFormat).
// r1
#define fPROCAMP_C0             r1.0    // DWORD 0, Procamp constant C0 in :f
#define wPROCAMP_C0             r1.0    // DWORD 0, Procamp constant C0 in :w
#define NUMBER_0002							r1.1		// DWORD 0, word 1. 0x0002 used in procamp for GT
#define udCP_MessageFormat      r1.0    // DWORD 0, bits 2:3 of DWORD. (CE)
#define udCP_StatePointer       r1.0    // DWORD 0, bits 31:5 of DWORD.(CE)

#define ubSRC_CF_OFFSET         r1.4    // DWORD 1, byte 0-2. SRC packed color format YUV offset in :ub

// ubDEST_RGB_FORMAT and ubDEST_CF_OFFSET share the same starting byte (r1.8).
#define ubDEST_RGB_FORMAT        r1.8    // DWORD 2, byte 0. Dest RGB color format (0:ARGB FF:XRGB)
#define ubDEST_CF_OFFSET        r1.8    // DWORD 2, byte 0-2. Dest packed color format YUV offset in :ub

#define fPROCAMP_C1             r1.3    // DWORD 3, Procamp constant C1 in :f   
#define wPROCAMP_C1             r1.6    // DWORD 3, Procamp constant C1 in :w   
#define NUMBER_0100							r1.7		// DWORD 3, word 1. 0x0100 used in procamp for GT

#define fPROCAMP_C2             r1.4    // DWORD 4, Procamp constant C2 in :f
#define wPROCAMP_C2             r1.8    // DWORD 4, Procamp constant C2 in :w

#define uwSPITCH_DIV2           r1.10   // DWORD 5, byte 0-1. statistics surface pitch divided by 2

// DWORD 6 is described both as one :f value and as four STMM/field bytes.
#define fVIDEO_STEP_Y           r1.6    // DWORD 6, :f, AVS normalized reciprocal of Y Scaling factor
#define ubSTMM_SHIFT            r1.24   // DWORD 6, byte 0. Amount of right shift for the DI blending equation
#define ubSTMM_MIN              r1.25   // DWORD 6, byte 1. Min STMM for DI blending equation
#define ubSTMM_MAX              r1.26   // DWORD 6, byte 2. Max STMM for DI blending equation
#define ubTFLD_FIRST            r1.27   // DWORD 6, byte 3. Field parity order

// NOTE(review): the macro name says C5 but the comment says "C3" - confirm which is intended.
#define fPROCAMP_C5             r1.7    // DWORD 7, Procamp constant C3 in :f
#define wPROCAMP_C5             r1.14   // DWORD 7, Procamp constant C3 in :w

// r2
// Subregister numbers are in units of the prefix type (f = DWORD index, w = word index).
// NOTE(review): the macro name says C3 but the comment says "C4" - confirm which is intended.
#define fPROCAMP_C3             r2.0    // DWORD 0, Procamp constant C4 in :f
#define wPROCAMP_C3             r2.0    // DWORD 0, Procamp constant C4 in :w
                    
#define fCSC_C5					r2.2	// DWORD 2. WG+CSC constant C5
#define wCSC_C5					r2.4	// DWORD 2. WG+CSC constant C5

// NOTE(review): the macro name says C4 but the comment says "C5" - confirm which is intended.
#define fPROCAMP_C4             r2.3    // DWORD 3, Procamp constant C5 in :f
#define wPROCAMP_C4             r2.6    // DWORD 3, Procamp constant C5 in :w

#define fCSC_C8					r2.4	// DWORD 4. WG+CSC constant C8
#define wCSC_C8					r2.8	// DWORD 4. WG+CSC constant C8
#define fCSC_C9					r2.7	// DWORD 7. WG+CSC constant C9
#define wCSC_C9					r2.14	// DWORD 7. WG+CSC constant C9

// r3
#define fCSC_C0					r3.0	// DWORD 0. WG+CSC constant C0
#define wCSC_C0					r3.0	// DWORD 0. WG+CSC constant C0

// fSCALING_STEP_RATIO and fALPHA_STEP_X name the same DWORD (r3.1).
#define fSCALING_STEP_RATIO     r3.1    // DWORD 1, = Alpha_X_Scaling_Step / Video_X_scaling_Step :f (blending)
#define fALPHA_STEP_X           r3.1    // DWORD 1, = 1/Scale X, 0.5 = 2x, in :f (blending)

#define fALPHA_STEP_Y           r3.2    // DWORD 2, = 1/Scale Y, in :f

#define fCSC_C4					r3.3	// DWORD 3. WG+CSC constant C4
#define wCSC_C4					r3.6	// DWORD 3. WG+CSC constant C4
#define fCSC_C1					r3.4	// DWORD 4. WG+CSC constant C1
#define wCSC_C1					r3.8	// DWORD 4. WG+CSC constant C1

#define wSRC_H_ORI_OFFSET       r3.10   // DWORD 5, bytes 0,1 :w    
#define wSRC_V_ORI_OFFSET       r3.11   // DWORD 5, bytes 2,3 :w

// dCOLOR_PIXEL and fCSC_C2 name the same DWORD (r3.6).
#define dCOLOR_PIXEL            r3.6    // DWORD 6. Color pixel for Colorfill

#define fCSC_C2					r3.6	// DWORD 6. WG+CSC constant C2
#define wCSC_C2					r3.12	// DWORD 6. WG+CSC constant C2
#define fCSC_C3					r3.7	// DWORD 7. WG+CSC constant C3
#define wCSC_C3					r3.14	// DWORD 7. WG+CSC constant C3

// r4
#define fCSC_C6					r4.0	// DWORD 0. WG+CSC constant C6
#define wCSC_C6					r4.0	// DWORD 0. WG+CSC constant C6

#define wFRAME_ENDX             r4.2    // DWORD 1, word 0. Horizontal end = Origin+Width (in pixels)(for multiple blocks)
#define wNUM_BLKS               r4.3    // DWORD 1, word 1. Number of blocks to process (for multiple blocks)

#define wCOPY_ORIX              r4.5    // DWORD 2, word 1. A copy of X origin (for multiple blocks)
#define uwNLAS_ENABLE           r4.4    // DWORD 2, word 0, bit 15. NLAS enable bit

#define fCSC_C7					r4.3	// DWORD 3. WG+CSC constant C7
#define wCSC_C7					r4.6	// DWORD 3. WG+CSC constant C7
#define fCSC_C10				r4.4	// DWORD 4. WG+CSC constant C10
#define wCSC_C10				r4.8	// DWORD 4. WG+CSC constant C10

#define fFRAME_VID_ORIX         r4.5    // DWORD 5, Frame horizontal origin normalized for scale kernel

#define fFRAME_ALPHA_ORIX       r4.6    // DWORD 6. Normalized alpha horiz origin for the frame

#define fCSC_C11				r4.7	// DWORD 7. WG+CSC constant C11
#define wCSC_C11				r4.14	// DWORD 7. WG+CSC constant C11

//========================================

//========== Inline parameters ===========
// Subregister numbers are in units of the prefix type (f/ud = DWORD index,
// w/uw = word index, ub = byte index). So wORIY (r5.1, word 1) and
// fSRC_VID_H_ORI (r5.1, DWORD 1) are different locations despite the same text.
// r5
#define wORIX                   r5.0    // DWORD 0, byte 0-1. :w, Destination Block Horizontal Origin in pel
#define wORIY                   r5.1    // DWORD 0, byte 2-3. :w, Destination Block Vertical   Origin in pel

#define fSRC_VID_H_ORI          r5.1    // DWORD 1, :f, SRC Y horizontal origin normalized for scale kernel

#define fSRC_VID_V_ORI          r5.2    // DWORD 2, :f, SRC Y vertical origin normalized for scale kernel

#define fSRC_ALPHA_H_ORI        r5.3    // DWORD 3, :f, Normalized alpha horizontal origin

#define fSRC_ALPHA_V_ORI        r5.4    // DWORD 4, :f, Normalized alpha vertical origin

#define uwALPHA_MASK_X          r5.10   // DWORD 5, byte 0-1 :w, H. alpha mask
#define ubALPHA_MASK_Y          r5.22   // DWORD 5, byte 2.  :ub,V. alpha mask
#define ubBLK_CNT_X             r5.23   // DWORD 5, byte 3,  :ub, Horizontal Block Count per thread

// This mask is used for each block. It will be reloaded from r6 (below) for the last block.
#define udBLOCK_MASK            r5.6    // DWORD 6
#define uwBLOCK_MASK_H          r5.12   // DWORD 6, byte 0-1 :uw, Block horizontal mask used in non-DWord aligned kernels
#define ubBLOCK_MASK_V          r5.26   // DWORD 6, byte 2   :ub, Block vertical mask used in non-DWord aligned kernels
#define ubNUM_BLKS              r5.27   // DWORD 6, byte 3,  :ub, Total Block Count per thread

#define fVIDEO_STEP_X           r5.7    // DWORD 7. :f, AVS normalized reciprocal of X Scaling factor

// r6
#define fVIDEO_STEP_DELTA       r6.0    // DWORD 0. :f, AVS normalized delta between 2 adjacent scaling steps (used for non-linear scaling)

// This mask is used for the last block (assumes only M*1 and 1*N block partitions are supported).
// NOTE(review): uwBLOCK_MASK_H_MIDDLE is commented as "(left)" although it is named MIDDLE - confirm.
#define udBLOCK_MASK_2            r6.1    // DWORD 1
#define uwBLOCK_MASK_H_RIGHT      r6.2    // DWORD 1, byte 0-1 :uw, Block horizontal mask used in non-DWord aligned kernels (right)
#define ubBLOCK_MASK_V_BOTTOM     r6.6    // DWORD 1, byte 2   :ub, Block vertical mask used in non-DWord aligned kernels
#define uwBLOCK_MASK_H_MIDDLE     r6.4    // DWORD 2, byte 0-1 :uw, Block horizontal mask used in non-DWord aligned kernels (left)


//====================== Binding table =========================================

#if defined(DNDI)
    // DNDI Surface Binding Table
    //#define nBI_SRC_CURR        0       // Current input frame surface
    //#define nBI_SRC_PRIV        1       // Denoised previous input frame surface
    //#define nBI_SRC_STAT        2       // Statistics input surface (STMM / Noise motion history)
    //#define nBI_DEST_1ST        3       // 1st deinterlaced output frame surface
//    #define nBI_DEST_YUV        3       // Dest frame YUV (for DN only)
    //#define nBI_DEST_Y          3       // Dest frame Y (for DN only)
    //#define nBI_DEST_2ND        4       // 2nd deinterlaced output frame surface
    //#define nBI_DEST_DN_CURR    6       // Denoised current output frame surface
    //#define nBI_DEST_STAT       7       // Statistics output surface (STMM / Noise motion history)
//    #define nBI_DEST_U          8       // Dest frame U (for DN only)
//    #define nBI_DEST_V          9       // Dest frame V (for DN only)
//    #define nBI_SRC_U          10       // Src frame U (for DN only)
//    #define nBI_SRC_V          11       // Src frame V (for DN only)
//    #define nBI_SRC_UV         10       // Current src frame for UV
    
#endif

#if defined(INPUT_PL3)
    // PL3 Surface Binding Table
//    #define nBI_SRC_ALPHA       0       // Alpha
//    #define nBI_SRC_Y           1       // Current src frame
//    #define nBI_SRC_U           2       // Current src frame
//    #define nBI_SRC_V           3       // Current src frame
//    #define nBI_DEST_Y         10       // Dest frame
//    #define nBI_DEST_U         11       // Dest frame
//    #define nBI_DEST_V         12       // Dest frame
//    #define nBI_DEST_YUV        7       // Dest frame
//    #define nBI_DEST_RGB        7       // same num as BI_DEST_YUV, never used at the same time
#endif

#if defined(INPUT_PL2)
    // PL2 Surface Binding Table
//    #define nBI_SRC_ALPHA       0       // Alpha
//    #define nBI_SRC_Y           1       // Current src frame for Y + offseted UV
//    #define nBI_SRC_YUV         1       // Current src frame for YUV in case of NV12_AVS
//    #define nBI_SRC_UV          2       // Current src frame for UV
//    #define nBI_DEST_YUV        7       // Current dest frame for Y + offseted UV
//    #define nBI_DEST_RGB        7       // same num as BI_DEST_YUV, never used at the same time
//    #define nBI_DEST_Y         10       // Dest frame
//    #define nBI_DEST_U         11       // Dest frame
//    #define nBI_DEST_V         12       // Dest frame
#endif

#if defined(INPUT_PA) || defined(COLORFILL)
    // Packed Surface Binding Table 
//    #define nBI_SRC_ALPHA       0       // Alpha    
//    #define nBI_SRC_YUV         1       // Current src frame
//    #define nBI_DEST_YUV        3       // Dest frame
//    #define nBI_DEST_RGB        3       // same num as BI_DEST_YUV, never used at the same time
#endif


// Super binding table: binding-table indices (nBI_*) for source, reference,
// destination, STMM history, sub-video, sub-picture and graphic surfaces.
// Several names map to the same index (e.g. index 1 is shared by the
// CURRENT_SRC YUV/Y/RGB names and the FIELD_COPY_SRC_1 YUV/Y names).
// NOTE(review): presumably only one alias per index is bound by a given kernel - confirm.
// Indices 16-19 and 24-25 have no name in this table.
#define nBI_ALPHA_SRC                   0
#define nBI_CURRENT_SRC_YUV             1 
#define nBI_FIELD_COPY_SRC_1_YUV        1 
#define nBI_CURRENT_SRC_Y               1 
#define nBI_FIELD_COPY_SRC_1_Y          1
#define nBI_CURRENT_SRC_RGB             1  
#define nBI_CURRENT_SRC_UV              2 
#define nBI_FIELD_COPY_SRC_1_UV         2 
#define nBI_CURRENT_SRC_U               2 
#define nBI_FIELD_COPY_SRC_1_U          2 
#define nBI_CURRENT_SRC_V               3
#define nBI_FIELD_COPY_SRC_1_V          3 
#define nBI_TEMPORAL_REFERENCE_YUV      4 
#define nBI_FIELD_COPY_SRC_2_YUV        4 
#define nBI_TEMPORAL_REFERENCE_Y        4 
#define nBI_FIELD_COPY_SRC_2_Y          4 
#define nBI_CURRENT_SRC_YUV_HW_DI       4 
#define nBI_TEMPORAL_REFERENCE_UV       5 
#define nBI_FIELD_COPY_SRC_2_UV         5 
#define nBI_TEMPORAL_REFERENCE_U        5 
#define nBI_FIELD_COPY_SRC_2_U          5 
#define nBI_DENOISED_PREV_HW_DI         5 
#define nBI_TEMPORAL_REFERENCE_V        6 
#define nBI_FIELD_COPY_SRC_2_V          6 
#define nBI_STMM_HISTORY                6 
#define nBI_DESTINATION_YUV             7
#define nBI_DESTINATION_RGB             7
#define nBI_DESTINATION_Y               7
#define nBI_DESTINATION_UV              8
#define nBI_DESTINATION_U               8
#define nBI_DESTINATION_V               9
#define nBI_DESTINATION_1_YUV           10
#define nBI_DESTINATION_1_Y             10
#define nBI_DESTINATION_1_UV            11
#define nBI_DESTINATION_1_U             11
#define nBI_DESTINATION_1_V             12
#define nBI_DESTINATION_2_YUV           13
#define nBI_DESTINATION_2_Y             13
#define nBI_DESTINATION_2_UV            14
#define nBI_DESTINATION_2_U             14
#define nBI_DESTINATION_2_V             15
#define nBI_STMM_HISTORY_OUTPUT         20
#define nBI_TEMPORAL_REFERENCE_YUV_PDI  21 
#define nBI_TEMPORAL_REFERENCE_Y_PDI    21 
#define nBI_TEMPORAL_REFERENCE_UV_PDI   22 
#define nBI_TEMPORAL_REFERENCE_U_PDI    22 
#define nBI_TEMPORAL_REFERENCE_V_PDI    23 
#define nBI_SUBVIDEO_YUV                26
#define nBI_SUBVIDEO_Y                  26
#define nBI_SUBVIDEO_UV                 27
#define nBI_SUBVIDEO_U                  27
#define nBI_SUBVIDEO_V                  28
#define nBI_SUBPICTURE_YUV              29
#define nBI_SUBPICTURE_P8               29
#define nBI_SUBPICTURE_A8               30
#define nBI_GRAPHIC_YUV                 31
#define nBI_GRAPHIC_P8                  31
#define nBI_GRAPHIC_A8                  32



//========== Planar Sampler State Table Index ==========
// Two parallel sets of sampler indices are defined for the same planes:
// the AVS/IEF set steps by 0x400 per plane and the SIMD16 set by 0x100.
// In both sets UV aliases U, and YUV/RGB alias Y.
#define nSI_SRC_ALPHA           0x000   // Sampler State for Alpha

// Sampler Index for AVS/IEF messages
#define nSI_SRC_Y               0x400   // Sampler State for Y
#define nSI_SRC_U               0x800   // Sampler State for U
#define nSI_SRC_V               0xC00   // Sampler State for V
#define nSI_SRC_UV              0x800   // For NV12 surfaces
#define nSI_SRC_YUV             0x400   // For Packed surfaces  
#define nSI_SRC_RGB             0x400   // For ARGB surfaces

// Sampler Index for SIMD16 sampler messages
#define nSI_SRC_SIMD16_Y        0x100   // Sampler State for Y
#define nSI_SRC_SIMD16_U        0x200   // Sampler State for U
#define nSI_SRC_SIMD16_V        0x300   // Sampler State for V
#define nSI_SRC_SIMD16_UV       0x200   // For NV12 surfaces
#define nSI_SRC_SIMD16_YUV      0x100   // For Packed surfaces  
#define nSI_SRC_SIMD16_RGB      0x100   // For ARGB surfaces



// Common Registers
// Address-register subregisters a0.4 - a0.6 hold the Y/U/V offsets.
#define pCF_Y_OFFSET            a0.4    // Address register holding Y offset
#define pCF_U_OFFSET            a0.5    // Address register holding U offset
#define pCF_V_OFFSET            a0.6    // Address register holding V offset

// #define YUV_ORI             ORIX    // Used by writing packed data to dport


//================= Message Payload Header fields ==============================
// Unlike most operand macros in this file, IDP carries its own type suffix (:ud).
#define IDP     r0.2:ud     // Interface Descriptor Pointer

//================= Common Message Descriptor  TBD add common load and save =====
// Base message descriptors for data-port media block write (nDPMW_MSGDSC) and
// read (nDPMR_MSGDSC). Per the bit breakdowns below, the low 8 bits (binding
// table index) are zero here and set later, and the message length is the minimum (1).
// Values differ between GT and ILK.
#ifdef GT
        // Message Descriptors
                //                = 000 0001 (min message len 1 - add later) 00000 (resp len 0)         
                //                  1 (header present 1) 0 0 1010 (media block write) 00000
                //                  00000000 (binding table index - set later)
                //                = 0x02094000
        #define nDPMW_MSGDSC      0x02094000  // Data Port Media Block Write Message Descriptor
        #define nDPMR_MSGDSC      0x02098000  // Data Port Media Block Read Message Descriptor
        // TBD
#else // ILK
        // Message Descriptors
                //                = 000 0001 (min message len 1 - add later) 00000 (resp len 0)         
                //                  1 (header present 1) 000 0 010 (media block write) 0000
                //                  00000000 (binding table index - set later)
                //                = 0x02082000
        #define nDPMW_MSGDSC      0x02082000  // Data Port Media Block Write Message Descriptor
        #define nDPMR_MSGDSC      0x0208A000  // Data Port Media Block Read Message Descriptor
#endif

// Message Length defines (nMSGLEN_n == n << 25)
#define nMSGLEN_1      0x02000000 // Message Length of 1 GRF for Send
#define nMSGLEN_2      0x04000000 // Message Length of 2 GRF for Send
#define nMSGLEN_4      0x08000000 // Message Length of 4 GRF for Send
#define nMSGLEN_8      0x10000000 // Message Length of 8 GRF for Send

// Response Length defines (nRESLEN_n == n << 20)
#define nRESLEN_1      0x00100000 // Message Response Length of 1  GRF from Send
#define nRESLEN_2      0x00200000 // Message Response Length of 2  GRF from Send
#define nRESLEN_3      0x00300000 // Message Response Length of 3  GRF from Send
#define nRESLEN_4      0x00400000 // Message Response Length of 4  GRF from Send
#define nRESLEN_5      0x00500000 // Message Response Length of 5  GRF from Send
#define nRESLEN_8      0x00800000 // Message Response Length of 8  GRF from Send
#define nRESLEN_9      0x00900000 // Message Response Length of 9  GRF from Send
#define nRESLEN_11     0x00B00000 // Message Response Length of 11 GRF from Send
#define nRESLEN_12     0x00C00000 // Message Response Length of 12 GRF from Send
#define nRESLEN_16     0x01000000 // Message Response Length of 16 GRF from Send

// Block Width and Height Size defines.
// Values are encoded as (size - 1): width in the low bits (nBLOCK_WIDTH_n == n - 1),
// height starting at bit 16 (nBLOCK_HEIGHT_n == (n - 1) << 16).
#define nBLOCK_WIDTH_4   0x00000003      // Block Width  4
#define nBLOCK_WIDTH_5   0x00000004      // Block Width  5
#define nBLOCK_WIDTH_8   0x00000007      // Block Width  8
#define nBLOCK_WIDTH_9   0x00000008      // Block Width  9
#define nBLOCK_WIDTH_12  0x0000000B      // Block Width  12
#define nBLOCK_WIDTH_16  0x0000000F      // Block Width  16
#define nBLOCK_WIDTH_20  0x00000013      // Block Width  20
#define nBLOCK_WIDTH_32  0x0000001F      // Block Width  32
#define nBLOCK_HEIGHT_1  0x00000000      // Block Height 1
#define nBLOCK_HEIGHT_2  0x00010000      // Block Height 2
#define nBLOCK_HEIGHT_4  0x00030000      // Block Height 4
#define nBLOCK_HEIGHT_5  0x00040000      // Block Height 5
#define nBLOCK_HEIGHT_8  0x00070000      // Block Height 8

// Extended Message Descriptors (target function of a send)
#define nEXTENDED_MATH      0x1
#define nSMPL_ENGINE        0x2
#define nMESSAGE_GATEWAY    0x3
#define nDATAPORT_READ      0x4
#define nDATAPORT_WRITE     0x5
#define nURB                0x6
// NOTE(review): 0x27 looks like function 0x7 with 0x20 as the End-Of-Thread bit - confirm.
#define nTS_EOT             0x27    // with End-Of-Thread bit ON

// Common message descriptors:
// nEOT_MSGDSC is the End-of-Thread message descriptor; its value differs by platform.
// IF_NULL expands to three null:uw operands on GT and to nothing on ILK, so the
// same "if" source line can be assembled for both.
#ifdef GT
	#define nEOT_MSGDSC       0x02000010  // End of Thread Message Descriptor    
	#define IF_NULL						null:uw null:uw null:uw 	//for different if instructions on ILK and Gen6
#else //ILK
	#define nEOT_MSGDSC       0x02000000  // End of Thread Message Descriptor        
	#define IF_NULL
#endif    


//===================== Math Function Control ===================================
// Function codes for the extended math unit. Code 0x9 has no name in this list.
#define mfcINV                  0x1     // reciprocal
#define mfcLOG                  0x2     // log
#define mfcEXP                  0x3     // exponent
#define mfcSQRT                 0x4     // square root
#define mfcRSQ                  0x5     // reciprocal square root
#define mfcSIN                  0x6     // sine (in radians)
#define mfcCOS                  0x7     // cosine (in radians)
#define mfcSINCOS               0x8     // dst0 = sin of src0, dst1 = cosine of src0 (in radians) - GT+ ONLY
#define mfcPOW                  0xA     // abs(src0) raised to the src1 power    
#define mfcINT_DIV_QR           0xB     // return quotient and remainder
#define mfcINT_DIV_Q            0xC     // return quotient
#define mfcINT_DIV_R            0xD     // return remainder


//=================== Message related registers =================================

// udDUMMY_NULL is the src0 placeholder for send: empty on GT, null:ud on ILK.
#ifdef GT
        #define udDUMMY_NULL
#else   // _ILK
        #define udDUMMY_NULL    null:ud         // Used in send inst as src0
#endif

 
//----------- Message Registers ------------
// Names for message registers m1 - m6 and m15. Several names alias the same
// register (mMSGHDR / mMSGHDRY / mMSGHDRY1 are all m1; mMSGHDRU / mMSGHDRY2 are m2; ...).
#define mMSGHDR      m1     // Message Payload Header
#define mMSGHDRY     m1     // Message Payload Header register for Y data
#define mMSGHDRU     m2     // Message Payload Header register for U data
#define mMSGHDRV     m3     // Message Payload Header register for V data
#define mMSGHDRYA    m4     // Second Message Payload Header register for Y data
#define mMSGHDRH     m5     // Message Payload Header register for motion history
#define mMSGHDRY1    m1     // Message Payload Header register for first  Y data
#define mMSGHDRY2    m2     // Message Payload Header register for second Y data
#define mMSGHDRY3    m3     // Message Payload Header register for third  Y data
#define mMSGHDRY4    m4     // Message Payload Header register for fourth Y data
#define mMSGHDRY5    m5     // Message Payload Header register for fifth Y data
#define mMSGHDRY6    m6     // Message Payload Header register for sixth Y data
#define mMSGHDR_EOT  m15    // Dummy Message Register for EOT

#define rMSGSRC     r8      // Message source register
#define pMSGDSC     a0.0:ud // Message Descriptor register (type DWORD)

// Fields of the media block R/W header staged in rMSGSRC (r8); udMH_ORI and
// udMH_ORIX name the same subregister.
#define udMH_ORI    rMSGSRC.0   // Data Port Media Block R/W message header block offset
#define udMH_ORIX   rMSGSRC.0   // Data Port Media Block R/W message header X offset
#define udMH_ORIY   rMSGSRC.1   // Data Port Media Block R/W message header Y offset
#define udMH_SIZE   rMSGSRC.2   // Data Port Media Block R/W message header block width & height

//  M2 - M9 for message data payload
// Typed views (ub / uw / ud / f) of the message payload, all based at m2.
.declare    mubMSGPAYLOAD  Base=m2 ElementSize=1 SrcRegion=REGION(16,1) Type=ub
.declare    muwMSGPAYLOAD  Base=m2 ElementSize=2 SrcRegion=REGION(16,1) Type=uw
.declare    mudMSGPAYLOAD  Base=m2 ElementSize=4 SrcRegion=REGION(8,1) Type=ud
.declare    mfMSGPAYLOAD   Base=m2 ElementSize=4 SrcRegion=REGION(8,1) Type=f

//=================== End of thread instruction ===========================
// END_THREAD expands to two instructions: a mov that copies the 8 dwords of r0
// into mMSGHDR_EOT (m15), then a send using nTS_EOT and nEOT_MSGDSC.
// The GT form writes the null destination literally and passes no src0; the ILK
// form uses dNULLREG, passes udDUMMY_NULL as src0 and types the descriptor as :ud.
// NOTE(review): the literal "\n" inside the macro body is presumably converted to
// a line break by a later build step - confirm.
#ifdef GT
	#define END_THREAD          mov  (8) mMSGHDR_EOT<1>:ud    r0.0<8;8,1>:ud \n\
								send (1) null<1>:d mMSGHDR_EOT nTS_EOT nEOT_MSGDSC 
#else   // ILK				This should be changed to 1 instruction; I have tested it and it works - vK
	#define END_THREAD          mov  (8) mMSGHDR_EOT<1>:ud    r0.0<8;8,1>:ud \n\
                            	send (1) dNULLREG mMSGHDR_EOT udDUMMY_NULL  nTS_EOT nEOT_MSGDSC:ud
#endif


//=======================================================================
// Region declarations for SRC and DEST as TOP and BOT

// Common I/O regions
#define nREGION_1       1
#define nREGION_2       2

//*** These region base GRFs are fixed regardless of planar/packed format and data alignment.
//*** Each kernel is responsible for selecting the correct region declaration below.
//*** YUV regions are not necessarily next to each other.
#define nTOP_Y          10      // r10 - r17 (8 GRFs)
#define nTOP_U          18      // r18 - r25 (8 GRFs)
#define nTOP_V          26      // r26 - r33 (8 GRFs)

#define nBOT_Y          56      // r56 - r63 (8 GRFs)
#define nBOT_U          64      // r64 - r71 (8 GRFs)
#define nBOT_V          72      // r72 - r79 (8 GRFs)

// Temp space for any usage: nTEMPk is GRF number 34 + k.
// Only some values of k are defined (0-8, 10, 12, 14, 16-18 and 24).
#define nTEMP0          34
#define nTEMP1          35
#define nTEMP2          36
#define nTEMP3          37
#define nTEMP4          38
#define nTEMP5          39
#define nTEMP6          40
#define nTEMP7          41
#define nTEMP8          42
#define nTEMP10         44
#define nTEMP12         46
#define nTEMP14         48
#define nTEMP16         50
#define nTEMP17         51
#define nTEMP18         52

// NOTE(review): r58 lies inside the r56 - r63 range listed for nBOT_Y above,
// so nTEMP24 overlaps the bottom Y I/O region - confirm this is intended.
#define nTEMP24			58

// Common region 1
// Typed views over the TOP I/O GRFs. Each Base is REG(r,nTOP_x), i.e. r10.0 (Y),
// r18.0 (U) or r26.0 (V). Prefix = element type (ub / uw / ud); ub2 and ub4 are
// byte views whose source region has a horizontal stride of 2 and 4 respectively.
.declare ubTOP_Y        Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(16,1) DstRegion=<1> Type=ub
.declare ubTOP_U        Base=REG(r,nTOP_U) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub
.declare ubTOP_V        Base=REG(r,nTOP_V) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub
                        
.declare uwTOP_Y        Base=REG(r,nTOP_Y) ElementSize=2 SrcRegion=REGION(16,1) DstRegion=<1> Type=uw
.declare uwTOP_U        Base=REG(r,nTOP_U) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
.declare uwTOP_V        Base=REG(r,nTOP_V) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
.declare ub2TOP_Y       Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(16,2) DstRegion=<1> Type=ub
.declare ub2TOP_U       Base=REG(r,nTOP_U) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub
.declare ub2TOP_V       Base=REG(r,nTOP_V) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub

.declare ub4TOP_Y       Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(8,4) Type=ub
.declare ub4TOP_U       Base=REG(r,nTOP_U) ElementSize=1 SrcRegion=REGION(8,4) Type=ub
.declare ub4TOP_V       Base=REG(r,nTOP_V) ElementSize=1 SrcRegion=REGION(8,4) Type=ub

// ARGB byte view; same base and region as ub4TOP_Y
.declare ubTOP_ARGB     Base=REG(r,nTOP_Y) ElementSize=1 SrcRegion=REGION(8,4) Type=ub

// Used by "send" instruction
.declare udTOP_Y_IO     Base=REG(r,nTOP_Y) ElementSize=4 SrcRegion=REGION(8,1) Type=ud
.declare udTOP_U_IO     Base=REG(r,nTOP_U) ElementSize=4 SrcRegion=REGION(8,1) Type=ud
.declare udTOP_V_IO     Base=REG(r,nTOP_V) ElementSize=4 SrcRegion=REGION(8,1) Type=ud

// Common region 2
// Typed views over the BOT I/O GRFs (r56.0 / r64.0 / r72.0), mirroring the TOP
// declarations above except that no ub4BOT_* views are declared.
.declare ubBOT_Y        Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(16,1) DstRegion=<1> Type=ub
.declare ubBOT_U        Base=REG(r,nBOT_U) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub
.declare ubBOT_V        Base=REG(r,nBOT_V) ElementSize=1 SrcRegion=REGION(8,1) DstRegion=<1> Type=ub
                        
.declare uwBOT_Y        Base=REG(r,nBOT_Y) ElementSize=2 SrcRegion=REGION(16,1) DstRegion=<1> Type=uw
.declare uwBOT_U        Base=REG(r,nBOT_U) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
.declare uwBOT_V        Base=REG(r,nBOT_V) ElementSize=2 SrcRegion=REGION(8,1) DstRegion=<1> Type=uw
.declare ub2BOT_Y       Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(16,2) DstRegion=<1> Type=ub
.declare ub2BOT_U       Base=REG(r,nBOT_U) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub
.declare ub2BOT_V       Base=REG(r,nBOT_V) ElementSize=1 SrcRegion=REGION(8,2) DstRegion=<1> Type=ub

.declare ubBOT_ARGB     Base=REG(r,nBOT_Y) ElementSize=1 SrcRegion=REGION(8,4) Type=ub

// Used by "send" instruction
.declare udBOT_Y_IO     Base=REG(r,nBOT_Y) ElementSize=4 SrcRegion=REGION(8,1) Type=ud
.declare udBOT_U_IO     Base=REG(r,nBOT_U) ElementSize=4 SrcRegion=REGION(8,1) Type=ud
.declare udBOT_V_IO     Base=REG(r,nBOT_V) ElementSize=4 SrcRegion=REGION(8,1) Type=ud

// End of common.inc

#endif    // COMMON_INC
